Systematic Associations Between Clinical Phenotypes and Environmental Exposures: Benchmarking Exposomic Research

# Tear down a previous interactive session: close its DB handle and clear the
# workspace before re-running the script from the top.
# NOTE(review): `remove(list=ls())` wipes the entire global environment, and
# dbDisconnect() runs before library(DBI) is attached below — both assume a
# prior session already had DBI loaded; confirm this is intended.
if(exists("con")) {
  dbDisconnect(con)
  remove(list=ls())
}
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(DBI)
library(ggsci)
library(DT)
library(ggrepel)
library(cowplot)
source('../quantpe.R')
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
# Combine per-survey P-E association statistics across model types.
#
# Args:
#   summary_stats: long-format coefficient table with a `model_type` column
#     (e.g. "adjusted"/"unadjusted") and estimate/std.error/statistic/p.value.
#   glanced_stats: long-format model-fit table (rsq, nobs, AIC, BIC, ...)
#     with the same `model_type` column, including a "base" model.
#
# Returns: one row per association (keyed by evarname, pvarname, and the
#   exposure/phenotype table names) with model types spread into columns,
#   plus adjusted-vs-unadjusted estimate and R^2 differences.
ep_assoc_summary_across_models <- function(summary_stats, glanced_stats) {
  summary_stats_wide <- summary_stats |>
    pivot_wider(names_from = "model_type",
                values_from = c("estimate", "std.error", "statistic", "p.value"))
  summary_stats_wide <- summary_stats_wide |>
    mutate(estimate_diff = estimate_adjusted - estimate_unadjusted)
  # BUG FIX: previously referenced the global `glanced` instead of the
  # `glanced_stats` argument, silently ignoring the caller's input.
  adj_vs_base <- glanced_stats |>
    select(-c(adj.r2, df.residual, null.deviance, df.null, deviance)) |>
    pivot_wider(names_from = model_type, values_from = c("rsq", "nobs", "AIC", "BIC"))
  adj_vs_base <- adj_vs_base |>
    mutate(rsq_adjusted_base_diff = rsq_adjusted - rsq_base,
           rsq_adjusted_diff = rsq_adjusted - rsq_unadjusted)
  summary_stats_wide |>
    left_join(adj_vs_base,
              by = c("evarname", "pvarname", "exposure_table_name", "phenotype_table_name"))
}
# Open the SQLite store of precomputed P-E (phenotype-exposure) summary stats.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname='../pe_summary_stats.sqlite')
# Lazy dbplyr table references; nothing is materialized until collect().
varnames <- tbl(con, "variable_names_epcf")
adjusted_meta <- tbl(con, "adjusted_meta")
unadjusted_meta <- tbl(con, "unadjusted_meta")
# Attach the unadjusted meta-analytic estimate/p-value alongside the adjusted one.
adjusted_meta <- adjusted_meta |> left_join(unadjusted_meta |> select(evarname, pvarname, expo_name, vartype, estimate, p.value) |> rename(estimate_unadjusted=estimate, p.value_unadjusted=p.value), by=c("evarname", "pvarname", "expo_name", "vartype"))
# mv = multivariable-exposome R^2 minus base (demographics-only) R^2.
mvr2 <- tbl(con, 'mvr2') |> mutate(mv = mve_rsq-base_rsq)
pe <- tbl(con, "pe")
glanced <- tbl(con, "glanced")
variable_domain <- tbl(con, "variable_domain")
# Keep only exposure coefficient rows (terms named expo, expo1, ...).
expos <- pe |> filter(term %like% 'expo%')
# Widen per-survey stats across model types and attach variable metadata.
expos_wide <- ep_assoc_summary_across_models(expos, glanced)
expos_wide <- expos_wide |> left_join(varnames, by=c("evarname"="Variable.Name", "exposure_table_name"="Data.File.Name"))
expos_wide <- expos_wide |> left_join(varnames |> select(Variable.Name, Data.File.Name, Variable.Description, Data.File.Description), 
                                      by=c("pvarname"="Variable.Name", "phenotype_table_name"="Data.File.Name"))

# Disambiguate the .x/.y suffixes from the two varnames joins (exposure vs.
# phenotype descriptions).
expos_wide <- expos_wide |> select(-Use.Constraints) |> rename(e_data_file_desc=Data.File.Description.x, p_data_file_desc=Data.File.Description.y,
                                                               e_variable_description=Variable.Description.x, 
                                                               p_variable_description=Variable.Description.y
                                                               )
expos_wide <- expos_wide |> collect()
# Per P-E pair: average R^2 gains across surveys and total sample size.
expos_wide_summary <- expos_wide |> filter(term == 'expo' | term == 'expo1') |> group_by(evarname, pvarname) |> summarize(mean_adjusted_base_r2_diff = mean(rsq_adjusted_base_diff), mean_unadjusted_r2_diff=mean(rsq_adjusted_diff), total_n = sum(nobs_adjusted)) |> ungroup()
## `summarise()` has grouped output by 'evarname'. You can override using the
## `.groups` argument.
adjusted_meta <- adjusted_meta |> collect() |> left_join(expos_wide_summary, by=c("evarname", "pvarname"))


# One row per variable: first description/category/subcategory per domain.
p_variable_domain <- variable_domain |> filter(epcf == 'p') |> collect() |> group_by(Variable.Name) |> summarise(pvardesc=first(Variable.Description),pcategory=first(category),psubcategory=first(subcategory))
e_variable_domain <- variable_domain |> filter(epcf == 'e') |> collect() |> group_by(Variable.Name) |> summarise(evardesc=first(Variable.Description),ecategory=first(category),esubcategory=first(subcategory))

adjusted_meta <- adjusted_meta |> left_join(p_variable_domain, by=c("pvarname"="Variable.Name"))
adjusted_meta <- adjusted_meta |> left_join(e_variable_domain, by=c("evarname"="Variable.Name"))

expos_wide <- expos_wide |> left_join(p_variable_domain, by=c("pvarname"="Variable.Name"))
expos_wide <- expos_wide |> left_join(e_variable_domain, by=c("evarname"="Variable.Name"))

Number of unique exposures and phenotypes

# Distinct exposure (E) and phenotype (P) counts; their product is the size
# of the full E x P search space (not every pair was necessarily testable).
num_e <- length(unique(adjusted_meta$evarname))
num_p <- length(unique(adjusted_meta$pvarname))
num_e
## [1] 859
num_p
## [1] 319
num_e * num_p
## [1] 274021

Number of exposures, phenotypes, and associations meta-analyzed across X surveys

#adjusted_meta <- adjusted_meta |> unnest(glanced) |> unnest(tidied)
# Distinct survey counts (nobs = number of surveys contributing to each
# meta-analyzed P-E association).
n_obss <- sort(unique(adjusted_meta$nobs))

# For each survey count n, tally the exposures, phenotypes, and P-E
# associations meta-analyzed across exactly n surveys.
num_tests <- map_df(n_obss, function(n) {
  # Filter once and reuse (previously the same filter ran three times per n).
  meta_n <- adjusted_meta |> filter(nobs == n)
  tibble(
    n_expos = n_distinct(meta_n$evarname),
    n_phenos = n_distinct(meta_n$pvarname),
    n_pxe = nrow(meta_n)
  )
})

# Show the per-survey-count tallies; print(n=11) covers all 10 rows.
num_tests  |> mutate(n_surveys=n_obss) |> print(n=11)
## # A tibble: 10 × 4
##    n_expos n_phenos n_pxe n_surveys
##      <int>    <int> <int>     <int>
##  1     853      319 58906         1
##  2     650      278 38251         2
##  3     519      241 21600         3
##  4     411      224 36138         4
##  5     319      216  9160         5
##  6     326       82  8794         6
##  7     217       75  1827         7
##  8     196       64  9423         8
##  9      38       60  1051         9
## 10      27       54  1573        10

Keep pairs observed in at least 2 surveys

adjusted_meta_2 <- adjusted_meta |> filter(nobs >= 2)

Sample sizes within and across all surveys

# Distribution of per-association sample sizes by survey cycle.
p <- ggplot(expos_wide, aes(factor(Begin.Year), nobs_adjusted))
p <- p + geom_boxplot() + ylab("Sample Sizes for P-E association") + xlab("Survey Year")
p

# Per P-E pair: total and median sample size plus number of surveys.
sample_size_per_pair <- expos_wide |> filter(term == 'expo' | term== 'expo1') |> group_by(evarname, pvarname) |> summarize(total_n=sum(nobs_adjusted), n_surveys=n(), median_n=median(nobs_adjusted))
## `summarise()` has grouped output by 'evarname'. You can override using the
## `.groups` argument.

Summary of the summary stats

# Multiple-testing correction: BY (Benjamini-Yekutieli) FDR and Bonferroni
# family-wise control across all remaining tests.
adjusted_meta_2 <- adjusted_meta_2 |> ungroup() |>  mutate(pval_BY=p.adjust(p.value, method="BY"), pvalue_bonferroni=p.adjust(p.value, method="bonferroni"))
# Three-tier significance label: Bonferroni-significant, BY-only, neither.
adjusted_meta_2 <- adjusted_meta_2 |> mutate(sig_levels = case_when(
  pvalue_bonferroni < 0.05 ~ 'Bonf.<0.05',
  pval_BY < 0.05 ~ 'BY<0.05',
  TRUE ~ '> BY & Bonf.'
))

# NOTE(review): bonf_thresh is computed here but later plots recompute
# 0.05/nrow(adjusted_meta_2) inline — consider reusing this variable.
bonf_thresh <- 0.05/nrow(adjusted_meta_2)
# Distribution of raw p-values and estimates across all tests.
quantile(adjusted_meta_2$p.value, probs=c(0.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99), na.rm = T)
##           1%           5%          10%          20%          30%          40% 
## 2.553508e-27 3.398496e-10 3.332952e-06 1.761850e-03 2.141903e-02 8.095805e-02 
##          50%          60%          70%          80%          90%          95% 
## 1.826276e-01 3.179099e-01 4.734236e-01 6.422462e-01 8.190745e-01 9.093611e-01 
##          99% 
## 9.816266e-01
quantile(adjusted_meta_2$estimate, probs=c(0.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99), na.rm = T)
##           1%           5%          10%          20%          30%          40% 
## -0.203919498 -0.075944258 -0.046650432 -0.023478563 -0.012016041 -0.003777513 
##          50%          60%          70%          80%          90%          95% 
##  0.003753614  0.011768619  0.021248856  0.033866834  0.057029843  0.090201321 
##          99% 
##  0.287943341
# Fraction of all tests that are Bonferroni-significant (~8.3%).
sum(adjusted_meta_2$pvalue_bonferroni < 0.05)/nrow(adjusted_meta_2) 
## [1] 0.08294671

Association Size vs. -log10(pvalue)

# Per-exposure summary across phenotypes. Rows are pre-sorted by Bonferroni
# p-value, so first() picks values from the most significant association —
# despite the max_ prefix, max_* columns hold that top hit's values.
e_summary <- adjusted_meta_2 |> group_by(evarname) |> arrange(pvalue_bonferroni) |>  
  summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=T),  mean_estimate=mean(abs(estimate), na.rm=T), 
            median_pvalue=median(p.value, na.rm=T), n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T), 
            n_tests=sum(!is.na(pvalue_bonferroni)),  median_i.squared=median(i.squared, na.rm=T),
            max_r2=first(mean_adjusted_base_r2_diff), max_pvarname=first(pvarname) , max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)


# Mirror-image summary per phenotype across exposures.
p_summary <- adjusted_meta_2 |> group_by(pvarname) |> arrange(pvalue_bonferroni) |> 
  summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=T), mean_estimate=mean(abs(estimate), na.rm=T),
            median_pvalue=median(p.value, na.rm=T), n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T), 
            n_tests=sum(!is.na(pvalue_bonferroni)),  median_i.squared=median(i.squared, na.rm=T),
            max_r2=first(mean_adjusted_base_r2_diff), max_evarname=first(evarname) , max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)

## deeper summary by group

# Same summaries at category_subcategory granularity (unite() joins the two
# columns with "_").
p_group_summary <- adjusted_meta_2 |> unite(p_scategory, c(pcategory, psubcategory)) |> group_by(p_scategory) |> arrange(pvalue_bonferroni) |>  
  summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=T),  mean_estimate=mean(abs(estimate), na.rm=T), 
            median_pvalue=median(p.value, na.rm=T), n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T), 
            n_tests=sum(!is.na(pvalue_bonferroni)),  median_i.squared=median(i.squared, na.rm=T),
            max_r2=first(mean_adjusted_base_r2_diff), max_evarname=first(evarname) , max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)


# Exposure-group summary additionally keeps the signed mean estimate
# (mean_estimate) next to the mean absolute estimate (mean_abs_estimate).
e_group_summary <- adjusted_meta_2 |> unite(e_scategory, c(ecategory, esubcategory)) |> group_by(e_scategory) |> arrange(pvalue_bonferroni) |>  
  summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=T),  
            mean_abs_estimate=mean(abs(estimate), na.rm=T),
            mean_estimate=mean((estimate), na.rm=T),
            median_pvalue=median(p.value, na.rm=T), 
            n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T), 
            n_tests=sum(!is.na(pvalue_bonferroni)),  
            median_i.squared=median(i.squared, na.rm=T),
            max_r2=first(mean_adjusted_base_r2_diff), 
            max_pvarname=first(pvarname), 
            max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)
# Cap p-values at 1e-30 so -log10 volcano plots keep a readable y range.
adjusted_meta_2 <- adjusted_meta_2 |> mutate(p_cap = ifelse(p.value < 1e-30, 1e-30, p.value))
# Volcano plot (estimate vs. -log10 p) faceted by exposure category;
# horizontal line marks the Bonferroni threshold.
p <- ggplot(adjusted_meta_2 |> filter(ecategory != 'autoantibody'), aes(estimate, -log10(p_cap)))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(-1, 1))
p <- p + facet_grid(ecategory ~ .) + scale_color_npg()
p <- p + geom_hline(yintercept = -log10(.05/nrow(adjusted_meta_2)), color='lightblue')
p <- p + theme_minimal() + theme(legend.position = "none") +ylab('p.value') + xlab("estimate")
p
## Warning: Removed 73 rows containing missing values (geom_point).

# NOTE(review): p_cap is recomputed here identically to the line above —
# redundant but harmless.
adjusted_meta_2 <- adjusted_meta_2 |> mutate(p_cap = ifelse(p.value < 1e-30, 1e-30, p.value))
# Same volcano plot restricted to microbiome phenotypes.
p <- ggplot(adjusted_meta_2 |> filter(pcategory == 'microbiome'), aes(estimate, -log10(p_cap)))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(-1, 1))
p <- p + facet_grid(ecategory ~ .) + scale_color_npg()
p <- p + geom_hline(yintercept = -log10(.05/nrow(adjusted_meta_2)), color='lightblue')
p <- p + theme_minimal() + theme(legend.position = "none") +ylab('p.value') + xlab("estimate")
p
## Warning: Removed 5 rows containing missing values (geom_point).

# Among Bonferroni-significant hits: mean signed estimate per exposure
# subcategory, split by estimate sign.
e_effect_sizes_per <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> group_by(ecategory, esubcategory, sign(estimate)) |> summarize(median_pvalue=median(p.value), number_signficant=n(), mean_estimate=mean((estimate))) |> arrange(-mean_estimate)
## `summarise()` has grouped output by 'ecategory', 'esubcategory'. You can
## override using the `.groups` argument.
# Fall back to the category name when the subcategory is missing.
e_effect_sizes_per <- e_effect_sizes_per |> mutate(esubcategory = ifelse(is.na(esubcategory), ecategory, esubcategory))
p <- ggplot(e_effect_sizes_per, aes(mean_estimate, -log10(median_pvalue), label=esubcategory))
p <- p + geom_point(aes(size=number_signficant)) + geom_text_repel() + geom_vline(xintercept = 0)
p <- p + theme_bw() + xlab("Average(Estimate) within exposome groups") + ylab("Median log10(pvalue)")
p <- p + theme(legend.position = "bottom")
p
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Mean R^2 gain per phenotype group among Bonferroni-significant hits.
p_effect_sizes_per <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> group_by(pcategory, psubcategory) |> summarize(mean_r2 = mean(mean_adjusted_base_r2_diff, na.rm=T))
## `summarise()` has grouped output by 'pcategory'. You can override using the
## `.groups` argument.
# ECDFs of |estimate| by significance tier: categorical exposures...
p <- ggplot(adjusted_meta_2 |> filter(vartype =='categorical'), aes(abs(estimate), color=sig_levels))
p <- p + stat_ecdf() + scale_x_continuous(limits=c(0, .25))
p <- p + xlab("abs(estimate)") + ylab("percentile")  + theme(legend.position="bottom") + scale_color_npg()
p
## Warning: Removed 2210 rows containing non-finite values (stat_ecdf).

# ...continuous exposures...
p <- ggplot(adjusted_meta_2 |> filter(vartype =='continuous'), aes(abs(estimate), color=sig_levels))
p <- p + stat_ecdf() + scale_x_continuous(limits=c(0, .25))
p <- p + xlab("abs(estimate)") + ylab("percentile") + theme(legend.position="bottom") + scale_color_npg()
p
## Warning: Removed 263 rows containing non-finite values (stat_ecdf).

# ...and all exposures combined.
p <- ggplot(adjusted_meta_2, aes(abs(estimate), color=sig_levels))
p <- p + stat_ecdf() + scale_x_continuous(limits=c(0, .25))
p <- p + xlab("abs(estimate)") + ylab("percentile") + theme(legend.position="bottom") + scale_color_npg()
p
## Warning: Removed 2479 rows containing non-finite values (stat_ecdf).

# Place each phenotype group's mean R^2 gain on the ECDF of R^2 gains among
# Bonferroni-significant associations.
ecdf_for_sig <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> pull(mean_adjusted_base_r2_diff) |> ecdf()
p_effect_sizes_per <- p_effect_sizes_per |> mutate(q = ecdf_for_sig(mean_r2), sig_levels ='Bonf.<0.05')
p_effect_sizes_per <- p_effect_sizes_per |> mutate(p_cat = ifelse(is.na(psubcategory), pcategory, psubcategory))
p <- ggplot(adjusted_meta_2, aes(mean_adjusted_base_r2_diff, color=sig_levels))
p <- p + stat_ecdf() + scale_x_continuous(limits=c(0, .05)) +scale_color_aaas()
p <- p + geom_point(data=p_effect_sizes_per, aes(x=mean_r2, y = q, color=sig_levels)) 
p <- p + geom_text_repel(data=p_effect_sizes_per, aes(x=mean_r2, y = q, color=sig_levels, label=p_cat)) 
p <- p + xlab("R^2 (adjusted-base model)") + ylab("percentile") 
p <- p + theme_bw() + theme(legend.position="bottom") 
p
## Warning: Removed 14458 rows containing non-finite values (stat_ecdf).
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Association sizes for all exposures

contextualizing all exposures

# Order the exposure factor by mean |estimate| for ranked plotting.
adjusted_meta_2 <- adjusted_meta_2 |> mutate(evarname = fct_reorder(evarname, abs(estimate), mean))
#p <- ggplot(adjusted_meta_2, aes(x=evarname, y=estimate, group=evarname))
#p <- p + geom_density_ridges()
#p <- p + geom_boxplot() + facet_grid(~sig_levels)
#p

Replicability and Consistency

I2 across number of surveys for P-E pair

# Heterogeneity (I^2) by number of contributing surveys and significance tier.
p <- ggplot(adjusted_meta_2, aes(factor(nobs), i.squared,color=sig_levels))
p <- p + geom_boxplot() + xlab("Number of Surveys for PE Association") + ylab("I^2") + theme(legend.position="bottom") + scale_color_aaas()
p <- p + theme_bw() + theme(legend.position="bottom") 
p

Reverse meta-analysis: replicability of P-E

# Per P-E pair and coefficient term: how many individual surveys reached
# nominal significance in the adjusted model, and what fraction that is.
within_survey_pvalue_threshold <- 0.05 # should this be changed for each survey?
p_val_pe_pair <- expos_wide |> group_by(evarname, pvarname, term) |> summarize(n_pvalue_lt=sum(p.value_adjusted<within_survey_pvalue_threshold), total_n=n(), pct_pvalue_lt=n_pvalue_lt/total_n)
## `summarise()` has grouped output by 'evarname', 'pvarname'. You can override
## using the `.groups` argument.
# Attach per-survey replication counts to the meta-analytic results.
adjusted_meta_3 <- adjusted_meta_2 |> left_join(p_val_pe_pair, by=c("evarname"="evarname", "pvarname"="pvarname", "expo_name"="term"))

Showcasing associations:

# Bonferroni-significant associations, overall and replicated (nominally
# significant in at least 2 individual surveys).
adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05) |> nrow() 
## [1] 10602
adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05, n_pvalue_lt >= 2) |> nrow()
## [1] 5048
# Replicated hits with R^2 gain > 2.5%, split by heterogeneity:
# I^2 < 50 = consistent across surveys; I^2 > 50 (and >= 4 surveys) = heterogeneous.
non_het_pairs <- adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05, n_pvalue_lt >= 2, i.squared < 50, mean_adjusted_base_r2_diff > .025)
het_pairs <- adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05, n_pvalue_lt >= 2, i.squared > 50, mean_adjusted_base_r2_diff > .025, nobs >= 4)
#het_pairs_2 <- adjusted_meta_3 |> filter(sig_levels == 'BY<0.05', i.squared > 90, nobs >= 4) 

# Bonferroni-significant associations per exposure category.
adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05) |> group_by(ecategory) |> count()
## # A tibble: 8 × 2
## # Groups:   ecategory [8]
##   ecategory             n
##   <chr>             <int>
## 1 allergy               5
## 2 housing              40
## 3 income               10
## 4 infection           286
## 5 nutrients          6011
## 6 physical activity   740
## 7 pollutant          3236
## 8 smoking             274
#super_filtered <- adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05, n_pvalue_lt >= 2, i.squared < .5, mean_adjusted_base_r2_diff > .005)
#super_filtered |> group_by(evarname) |> count() |> arrange(-n)
#super_filtered |> group_by(pvarname) |> count() |> arrange(-n)

## non-heterogeneous example

# Forest-style plot for one exposure-phenotype pair: per-survey adjusted
# estimates with 95% CIs, plus the overall meta-analytic estimate.
#
# Args:
#   evarname_str: exposure variable name (e.g. 'URXP01').
#   pvarname_str: phenotype variable name (e.g. 'LBDNENO').
#   estimate_limits: y-axis (estimate) limits; pass c() / NULL to let ggplot
#     choose them automatically.
#
# Returns: a ggplot object (not printed here).
plot_pair <- function(evarname_str, pvarname_str, estimate_limits=c(0.01,.35)) {
  test_1 <- expos_wide |>
    filter(evarname == evarname_str, pvarname == pvarname_str) |>
    select(Begin.Year, exposure_table_name, phenotype_table_name,
           e_variable_description, p_variable_description,
           estimate_adjusted, std.error_adjusted, p.value_adjusted)
  exposure_name <- test_1$e_variable_description[1]
  phenotype_name <- test_1$p_variable_description[1]
  test_1 <- test_1 |>
    select(Begin.Year, estimate_adjusted, std.error_adjusted) |>
    rename(estimate=estimate_adjusted, std.error = std.error_adjusted, Survey=Begin.Year)
  # Append the overall (meta-analytic) estimate as an extra "survey" row.
  meta_test_1 <- adjusted_meta_2 |>
    filter(evarname == evarname_str, pvarname == pvarname_str) |>
    mutate(Survey = 'overall') |>
    select(Survey, estimate, std.error)
  test_1 <- test_1 |> rbind(meta_test_1)
  # Diamond (shape 23, larger) for the overall row; circle (21) otherwise.
  test_1 <- test_1 |> mutate(point_shape = ifelse(Survey == 'overall', 23, 21)) |> mutate(point_shape = as.integer(point_shape))
  test_1 <- test_1 |> mutate(point_size = ifelse(Survey == 'overall', 7, 2))
  p <- ggplot(test_1, aes(Survey, estimate))
  p <- p + geom_point(aes(shape=point_shape, size=point_size, fill=point_shape)) + scale_shape_identity() + scale_size_identity()
  p <- p + geom_errorbar(aes(ymin=estimate-1.96*std.error, ymax=estimate+1.96*std.error), width=.1) + scale_x_discrete(limits=rev)
  p <- p + scale_y_continuous(limits=estimate_limits)
  p <- p + coord_flip() + theme_bw() + theme(legend.position = "none")
  p <- p + ggtitle(sprintf('scale(%s)-scale(log10(%s))', phenotype_name, exposure_name)) + theme(plot.title = element_text(size = 7))
  # BUG FIX: the function previously ended on an assignment, so the plot was
  # returned only invisibly; return it explicitly and visibly.
  p
}

## non-heterogeneous example
p1 <- plot_pair('URXP01', 'LBDNENO')
## heterogeneous example
p2 <- plot_pair('LBXGTC', 'BMXBMI')

# Passing c() (i.e. NULL) leaves the estimate-axis limits to ggplot defaults.
p3 <- plot_pair('LBXBPB', 'BMXHT', c()) # 33% i2

p4 <- plot_pair('LBXCOT', 'BPXPLS', c()) # 33% i2



#expos_wide |> filter(evarname == 'LBXPFOS', pvarname == 'LBXSAL') 
# 2x2 panel of the four showcased forest plots.
plot_grid(p1, p2, p3, p4, ncol=2,labels = c('A', 'B', "C", "D"), label_size = 12)

R2 adjusted vs. non-adjusted

# Model fit: base (demographics only) vs. adjusted (exposure + demographics).
p <- ggplot(expos_wide, aes(rsq_base, rsq_adjusted))
p <- p + geom_point(shape='.') + xlab("R2 Base [Demographics]") + ylab("R2 [Exposure + Demographics]")
p <- p + theme_bw()
p

# Adjusted vs. unadjusted (exposure-only) model R^2.
p <- ggplot(expos_wide, aes(rsq_adjusted, rsq_unadjusted))
p <- p + geom_point(shape='.') + xlab("R2 Adjusted [Exposure + Demographics]") + ylab("R2 Unadjusted [Exposure]")
p <- p + theme_bw() 
p

# Effect size (adjusted-minus-base R^2) vs. heterogeneity (I^2), faceted by
# significance tier.
p <- ggplot(adjusted_meta_2, aes(mean_adjusted_base_r2_diff, i.squared))
# BUG FIX: the x variable is the adjusted-minus-base R^2 difference, not the
# adjusted model's R^2 — label it accordingly.
p <- p + geom_point(shape='.') + xlab("R2 difference [Adjusted - Base]") + ylab("i.squared") + scale_x_continuous(limits=c(0, .01))
p <- p + theme_bw() + facet_grid(~sig_levels)
p
## Warning: Removed 17457 rows containing missing values (geom_point).

Estimate and P-values: adjusted vs. non-adjusted

# Adjusted vs. unadjusted survey-level estimates with identity and fitted lines.
p <- ggplot(expos_wide, aes(estimate_unadjusted, estimate_adjusted))
# BUG FIX: xlab/ylab were swapped relative to the mapped aesthetics
# (x = unadjusted, y = adjusted).
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(-2, 2)) + scale_y_continuous(limits=c(-2, 2)) + xlab("Unadjusted [Exposure]") + ylab("Adjusted [Exposure + Demographics]") + geom_abline()
p <- p + geom_smooth(method="lm")
p <- p + theme_bw()
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 193 rows containing non-finite values (stat_smooth).
## Warning: Removed 193 rows containing missing values (geom_point).

## how much bias?
# Regress adjusted on unadjusted estimates across all survey-level
# associations; negative intercept and slope < 1 indicate demographic
# adjustment systematically attenuates estimates.
tidy(lm(estimate_adjusted  ~ estimate_unadjusted, expos_wide)) # biased to be lower (intercept is negative, and slope is less than 1)
## # A tibble: 2 × 5
##   term                estimate std.error statistic p.value
##   <chr>                  <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)         -0.00859 0.0000994     -86.4       0
## 2 estimate_unadjusted  0.609   0.000657      928.        0
# 

# P-value comparison (-log10 scale): unadjusted (x) vs. adjusted (y) models.
p <- ggplot(expos_wide, aes(-log10(p.value_unadjusted), -log10(p.value_adjusted)))
# BUG FIX: xlab/ylab were swapped relative to the mapped aesthetics.
p <- p + geom_point(shape='.', alpha=.1) + xlab("Unadjusted model pvalue [Exposure]") + ylab("Adjusted Pvalue [Exposure + Demographics]")
p <- p + geom_abline()
p <- p + theme_bw()
p
## Warning: Removed 17595 rows containing missing values (geom_point).

# Test-statistic (z) comparison: unadjusted (x) vs. adjusted (y) models.
p <- ggplot(expos_wide, aes(statistic_unadjusted, statistic_adjusted))
# BUG FIX: xlab/ylab were swapped relative to the mapped aesthetics.
p <- p + geom_point(shape='.', alpha=.1) + xlab("Unadjusted z [Exposure]") + ylab("Adjusted model z [Exposure + Demographics]")
p <- p + geom_abline() + scale_y_continuous(limit=c(-20, 20)) + scale_x_continuous(limits=c(-20,20))
p <- p + theme_bw()
p
## Warning: Removed 3161 rows containing missing values (geom_point).

# Meta-analytic adjusted estimate (x) vs. unadjusted (y), faceted by
# significance tier, with identity and fitted lines.
p <- ggplot(adjusted_meta_2 , aes(estimate, estimate_unadjusted, color=sig_levels))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(-1, 1)) + scale_y_continuous(limits=c(-1, 1)) + scale_color_aaas()
p <- p + geom_abline()
p <- p + facet_grid(~sig_levels) + xlab("Adjusted model estimate [Exposure + Demographics]") + ylab("Unadjusted estimate [Exposure]")
p <- p + geom_smooth(method="lm")
p <- p + theme_bw() +theme(legend.position = "none")
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 193 rows containing non-finite values (stat_smooth).
## Removed 193 rows containing missing values (geom_point).

# Attenuation slope overall and among Bonferroni-significant hits (steeper,
# i.e. less attenuated, among significant associations).
tidy(lm(estimate ~ estimate_unadjusted, adjusted_meta_2))
## # A tibble: 2 × 5
##   term                estimate std.error statistic   p.value
##   <chr>                  <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)         -0.00471  0.000177     -26.6 1.27e-155
## 2 estimate_unadjusted  0.464    0.00141      329.  0
tidy(lm(estimate ~ estimate_unadjusted, adjusted_meta_2 |> filter(pvalue_bonferroni < .05)))
## # A tibble: 2 × 5
##   term                estimate std.error statistic  p.value
##   <chr>                  <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)          -0.0153  0.000877     -17.4 6.27e-67
## 2 estimate_unadjusted   0.631   0.00476      133.  0
# Same adjusted-vs-unadjusted comparison, faceted by significance tier and
# exposure category.
p <- ggplot(adjusted_meta_2 , aes(estimate_unadjusted, estimate, color=ecategory))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(-1, 1)) + scale_y_continuous(limits=c(-1, 1)) + scale_color_aaas()
p <- p + geom_abline()
# BUG FIX: xlab/ylab were swapped relative to the mapped aesthetics
# (x = unadjusted, y = adjusted).
p <- p + facet_grid(sig_levels~ecategory) + xlab("Unadjusted estimate [Exposure]") + ylab("Adjusted model estimate [Exposure + Demographics]")
p <- p + geom_smooth(method="lm")
p <- p + theme_bw() +theme(legend.position = "none")
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 193 rows containing non-finite values (stat_smooth).
## Removed 193 rows containing missing values (geom_point).

Which e and P domains are most subject to demographic bias?

# Per exposure category: regress adjusted on unadjusted estimates; the slope
# (< 1) measures how strongly demographic adjustment attenuates that
# category's estimates. Models are stored in a list-column and tidied.
bias_per_ecategory <- adjusted_meta_2 |> group_by(ecategory) |> summarize(
  mod=list(lm(estimate ~ estimate_unadjusted))) |> mutate(tidied=map(mod, tidy)) |> unnest(tidied)

bias_per_ecategory |> select(-mod) |> select(ecategory, term, estimate, p.value) |> filter(term == "estimate_unadjusted") |> select(-term)
## # A tibble: 9 × 3
##   ecategory         estimate  p.value
##   <chr>                <dbl>    <dbl>
## 1 allergy              0.753 2.65e-94
## 2 autoantibody         0.478 3.09e-64
## 3 housing              0.354 4.72e-81
## 4 income               0.725 0       
## 5 infection            0.557 0       
## 6 nutrients            0.287 0       
## 7 physical activity    0.415 0       
## 8 pollutant            0.386 0       
## 9 smoking              0.660 0
# Same attenuation analysis per phenotype category.
bias_per_pcategory <- adjusted_meta_2 |> group_by(pcategory) |> summarize(
  mod=list(lm(estimate ~ estimate_unadjusted))) |> mutate(tidied=map(mod, tidy)) |> unnest(tidied)

bias_per_pcategory |> select(-mod) |> select(pcategory, term, estimate, p.value) |> filter(term == "estimate_unadjusted") |> select(-term)
## # A tibble: 8 × 3
##   pcategory      estimate   p.value
##   <chr>             <dbl>     <dbl>
## 1 aging             0.300 9.28e- 16
## 2 anthropometric    0.347 0        
## 3 biochemistry      0.665 0        
## 4 blood             0.675 0        
## 5 blood pressure    0.514 0        
## 6 fitness           0.637 3.56e-159
## 7 lung              0.379 8.71e-193
## 8 microbiome        0.791 0

Multivariate R2 of the exposome

# Multivariable exposome R^2 (above demographics, mv) vs. number of exposure
# variables in the model; label phenotypes with > 40 exposure variables.
p <- ggplot(mvr2, aes(n_evars, mv))
p <- p + geom_point() + theme_bw()
p <- p + geom_text(data=mvr2 |> filter(n_evars > 40),aes(n_evars, mv, label=pvarname) )
p <- p + xlab("Number of Exposome Variables in Model") + ylab("R-squared")
p

# Summary of the multivariable exposome R^2 across phenotypes.
mvr2 |> collect() |> summarize(mean_sample_size=mean(n), median_r2=median(mv), q25_r2=quantile(mv, probs=.25), q75_r2=quantile(mv, probs=.75))
## # A tibble: 1 × 4
##   mean_sample_size median_r2  q25_r2 q75_r2
##              <dbl>     <dbl>   <dbl>  <dbl>
## 1           16621.    0.0153 0.00458 0.0365

Correlation of phenotypes in exposome space

library(corrr)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Exposure x phenotype matrix of adjusted continuous-exposure estimates;
# non-significant (p >= 0.05) or missing estimates are zeroed out.
to_array <- adjusted_meta_2 |> filter(expo_name == 'expo', vartype == 'continuous') |> select(evarname, pvarname, estimate, p.value) |> mutate(estimate= ifelse(p.value >= 0.05, 0, estimate)) |> mutate(estimate = ifelse(is.na(estimate), 0, estimate))  |> select(-p.value) |> pivot_wider(names_from = pvarname, values_from = estimate)

# correlate() operates on the columns (phenotypes) across exposure rows, so
# this is a phenotype-phenotype correlation in exposome space.
# FIX: renamed from `exposure_correlation`, which mislabeled what is
# actually correlated here.
phenotype_correlation <- to_array |> select(-evarname) |> correlate(diagonal = 1)
## Warning in stats::cor(x = x, y = y, use = use, method = method): the standard
## deviation is zero
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
m <- phenotype_correlation |> as_matrix()
m[is.na(m)] <- 0

# Diverging palette for heatmap.2: `numColors` cool blues ramping from dark/
# saturated to light/pale, followed by the reverse of an equivalent warm
# yellow ramp, giving 2 * numColors colors centered on the palest shades.
heatmapColors <- function(numColors=16) {
  brightness <- seq(0.5, 1, length = numColors)
  saturation <- seq(1, 0.3, length = numColors)
  blue_ramp <- rainbow(numColors, v = brightness, s = saturation,
                       start = 4/6, end = 4.0001/6)
  yellow_ramp <- rainbow(numColors, v = brightness, s = saturation,
                         start = 1/6, end = 1.0001/6)
  c(blue_ramp, rev(yellow_ramp))
}

heatmap.2(m, trace = 'none', na.rm = F, scale = 'none', symm = T, col=heatmapColors(5), margins=c(16,16), sepwidth=c(.1, .1), symbreaks=T)

Correlation of the exposome

# Phenotype x exposure matrix of adjusted continuous-exposure estimates;
# non-significant (p >= 0.05) or missing estimates are zeroed out.
to_array <- adjusted_meta_2 |> filter(expo_name == 'expo', vartype == 'continuous') |> select(evarname, pvarname, estimate, p.value) |> mutate(estimate= ifelse(p.value >= 0.05, 0, estimate)) |> mutate(estimate = ifelse(is.na(estimate), 0, estimate))  |> select(-p.value) |> pivot_wider(names_from = evarname, values_from = estimate)

# correlate() operates on the columns (exposures) across phenotype rows, so
# this is an exposure-exposure correlation in phenome space.
# FIX: renamed from `phenome_correlation`, which mislabeled what is actually
# correlated here.
exposome_correlation <- to_array |> select(-pvarname) |> correlate(diagonal = 1)
## Warning in stats::cor(x = x, y = y, use = use, method = method): the standard
## deviation is zero
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
m <- exposome_correlation |> as_matrix()
m[is.na(m)] <- 0

# Clustered heatmap of the exposure-exposure correlation matrix.
heatmap.2(m, trace = 'none', na.rm = F, scale = 'none', symm = T, col=heatmapColors(5), margins=c(16,16), sepwidth=c(.1, .1), symbreaks=T)